# Check the inter-rater reliability (IRR) statistics for the intra-cluster validation exercise

library(irr)

# Directory holding the coding spreadsheet. The path is built explicitly with
# file.path() instead of calling setwd(), so running this script does not
# change the session's working directory as a side effect.
data_dir <- "C:/Users/jonathan/Desktop/Nicholls CMM"

# Load the coded sample.
df <- read.csv(file.path(data_dir, "clusters-full coding.csv"), sep = ",")

# Get rid of the singletons: count the rows in each cluster, attach that count
# to every row as `Freq`, and keep only clusters with more than one row.
dfagg <- data.frame(table(df$Cluster))
df2 <- merge(df, dfagg, by.x = "Cluster", by.y = "Var1")
df <- df2[df2$Freq > 1, ]

# Drop factor levels that no longer occur after the filter.
df <- droplevels(df)

# How many clusters did we sample? (one table cell per distinct cluster)
length(table(df$Cluster))

# Convert a coder column to numeric.
# as.numeric() applied directly to a factor returns the internal level indices
# rather than the recorded values, so factors are converted via character
# first. Any other input is passed to as.numeric() unchanged.
to_numeric_code <- function(x) {
  if (is.factor(x)) {
    x <- as.character(x)
  }
  as.numeric(x)
}

df$JB.code <- to_numeric_code(df$JB.code)
df$TN.code <- to_numeric_code(df$TN.code)

# How many rows (articles) per cluster; columns are Var1 (cluster) and Freq.
df.sizes <- data.frame(table(df$Cluster))

# Sum of the two coders' codes. If switch == 1 there is a disagreement.
# NOTE(review): this reading assumes both codes are 0/1 -- confirm against
# the coding scheme.
df$switch <- df$JB.code + df$TN.code

# Two broken links are no longer accessible; drop rows with no JB code.
df <- df[!is.na(df$JB.code), ]

# Counts and proportions (rounded to 2 decimals) of each switch value.
switch_counts <- table(df$switch)
switch_counts
round(prop.table(switch_counts), 2)

# Use this to investigate disagreement between coders.
# which() discards NA comparisons (e.g. a missing TN code), which would
# otherwise show up here as rows filled entirely with NA.
disag <- df[which(df$switch == 1), ]


# IRR scores
# Keep just the two coders' columns: one row per coded item, one column per coder.
df.codes <- df[, c("JB.code", "TN.code")]

# Agreement between the two coders (irr::agree with its default settings).
agree(df.codes)

# Krippendorff's alpha takes coders in rows and items in columns, hence the
# transpose. No `method` is supplied, so the package default applies.
kripp.alpha(t(as.matrix(df.codes)))
